#loading packages
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.5     ✓ stringr 1.4.0
## ✓ tidyr   1.1.2     ✓ forcats 0.5.0
## ✓ readr   1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()
library(ggridges) # for joy plots
library(plotly) 
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(gganimate)     # for adding animation layers to ggplots
library(gifski)        # for creating the gif (don't need to load this library every time,but need it installed)
# Read the Spotify songs CSV (TidyTuesday, 2020-01-21) directly from GitHub.
spotify <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv"
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   track_id = col_character(),
##   track_name = col_character(),
##   track_artist = col_character(),
##   track_album_id = col_character(),
##   track_album_name = col_character(),
##   track_album_release_date = col_character(),
##   playlist_name = col_character(),
##   playlist_id = col_character(),
##   playlist_genre = col_character(),
##   playlist_subgenre = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
# Rap subset: every row whose playlist genre is "rap".
spotify_rap <- filter(spotify, playlist_genre == "rap")

# R&B subset: rows from "r&b" playlists with a popularity of at least 75,
# with the three identifier columns removed.
randb <- spotify %>%
  filter(playlist_genre == "r&b", track_popularity >= 75) %>%
  select(-c(track_id, track_album_id, playlist_id))

Introduction & Background

Why did we do an analysis on Spotify? Why is the data significant, and why should people care? Introduce the data to the audience.

# Boxplots of track popularity for each playlist genre, with a blue vertical
# reference line at the overall median popularity.
prelim_graph <- spotify %>%
  ggplot(aes(x = track_popularity, y = playlist_genre)) +
  geom_boxplot() +
  # The median is passed as a constant rather than mapped inside aes():
  # a mapped value is recycled to every row of the data, which makes the
  # layer draw one identical, overlapping line per track.
  geom_vline(
    xintercept = median(spotify$track_popularity, na.rm = TRUE),
    color = "blue"
  ) +
  labs(
    title = "Song Popularity by Genre",
    subtitle = "Song popularity is measured from 0-100, with higher numbers being indiciative of more popularity.\nHighest median popularities belong to pop and latin with an overall median popularity of 40",
    caption = "Alex Ismail, Malek Kaloti, Brian Lee",
    x = "",
    y = ""
  ) +
  theme_classic() +
  theme(
    plot.title.position = "plot",
    plot.title = element_text(size = 20, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "italic")
  )

prelim_graph

# Names of the columns in positions 12 through 23 of `spotify`.
# NOTE(review): positional indexing depends on the CSV's column order; confirm
# these positions still cover the intended audio-feature columns if the source
# file ever changes.
feature_names <- names(spotify)[12:23]

# Density curves of each selected column: one facet per column (free scales)
# and one colored curve per playlist genre.
# all_of() marks `feature_names` as an external character vector; a bare
# vector inside a selection is ambiguous with a column of the same name.
density_plot <- spotify %>%
  select(playlist_genre, all_of(feature_names)) %>%
  pivot_longer(cols = all_of(feature_names)) %>%
  ggplot(aes(x = value)) +
  geom_density(aes(color = playlist_genre), alpha = 0.5) +
  facet_wrap(~name, ncol = 3, scales = "free") +
  labs(
    title = "Spotify Audio Feature Density - by Genre",
    x = "",
    y = "density"
  ) +
  theme(axis.text.y = element_blank())
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(feature_names)` instead of `feature_names` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Convert the faceted density plot into an interactive plotly widget.
plotly::ggplotly(p = density_plot)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Ridgeline densities of popularity by playlist genre, limited to tracks with
# a popularity of 75 or more.
ggplot(
  data = filter(spotify, track_popularity >= 75),
  mapping = aes(y = playlist_genre, x = track_popularity)
) +
  geom_density_ridges() +
  theme_ridges() +
  labs(y = "Playlist Genre", x = "Popularity")
## Picking joint bandwidth of 1.37

#get rid of axes, add a more descriptive subtitle

Data Collection

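Data retrieved from the TidyTuesday GitHub repository (rfordatascience/tidytuesday, 2020-01-21 Spotify songs dataset): https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv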

Analysis!

Rap

# Percentage of rap tracks counted as popular (popularity above 75) at each
# value of four audio features rounded to one decimal place, drawn as one
# line per feature.
spotify_rap %>%
  mutate(
    Rounded_Danceability = round(danceability, digits = 1),
    Rounded_Energy = round(energy, digits = 1),
    Rounded_Speechiness = round(speechiness, digits = 1),
    Rounded_Instrumental = round(instrumentalness, digits = 1),
    popular = track_popularity > 75
  ) %>%
  pivot_longer(
    cols = starts_with("Rounded"),
    names_to = "Stat1",
    values_to = "Rounded_Value"
  ) %>%
  group_by(Stat1, Rounded_Value) %>%
  # Drop the grouping here: if the result stays grouped by Stat1, the recode
  # below runs once per group, where only one of the four names is present,
  # and fct_recode() warns about the other three being unknown levels.
  summarize(Pop_Rate = mean(popular) * 100, .groups = "drop") %>%
  mutate(
    Stat = fct_recode(
      Stat1,
      Danceability = "Rounded_Danceability",
      Energy = "Rounded_Energy",
      Speechiness = "Rounded_Speechiness",
      Instrumental = "Rounded_Instrumental"
    )
  ) %>%
  ggplot(aes(x = Rounded_Value, y = Pop_Rate)) +
  geom_line(aes(color = Stat)) +
  labs(
    title = "Popularity of Rap Songs by Song Characteristic",
    x = "",
    y = "Percent Popular",
    color = "Song Statistic"
  ) +
  theme_classic() +
  theme(
    plot.title.position = "plot",
    plot.title = element_text(size = 20, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "italic")
  )
## `summarise()` regrouping output by 'Stat1' (override with `.groups` argument)
## Warning: Problem with `mutate()` input `Stat`.
## ℹ Unknown levels in `f`: Rounded_Energy, Rounded_Speechiness, Rounded_Instrumental
## ℹ Input `Stat` is `fct_recode(...)`.
## ℹ The error occurred in group 1: Stat1 = "Rounded_Danceability".
## Warning: Unknown levels in `f`: Rounded_Energy, Rounded_Speechiness,
## Rounded_Instrumental
## Warning: Problem with `mutate()` input `Stat`.
## ℹ Unknown levels in `f`: Rounded_Danceability, Rounded_Speechiness, Rounded_Instrumental
## ℹ Input `Stat` is `fct_recode(...)`.
## ℹ The error occurred in group 2: Stat1 = "Rounded_Energy".
## Warning: Unknown levels in `f`: Rounded_Danceability, Rounded_Speechiness,
## Rounded_Instrumental
## Warning: Problem with `mutate()` input `Stat`.
## ℹ Unknown levels in `f`: Rounded_Danceability, Rounded_Energy, Rounded_Speechiness
## ℹ Input `Stat` is `fct_recode(...)`.
## ℹ The error occurred in group 3: Stat1 = "Rounded_Instrumental".
## Warning: Unknown levels in `f`: Rounded_Danceability, Rounded_Energy,
## Rounded_Speechiness
## Warning: Problem with `mutate()` input `Stat`.
## ℹ Unknown levels in `f`: Rounded_Danceability, Rounded_Energy, Rounded_Instrumental
## ℹ Input `Stat` is `fct_recode(...)`.
## ℹ The error occurred in group 4: Stat1 = "Rounded_Speechiness".
## Warning: Unknown levels in `f`: Rounded_Danceability, Rounded_Energy,
## Rounded_Instrumental

# Percentage of tracks counted as popular (popularity above 75), comparing
# tracks whose name contains "remix" or "feat" against all other tracks,
# with one facet per playlist genre ("rap" first).
spotify %>%
  mutate(
    track_name_lower = str_to_lower(track_name),
    # Patterns must be lower-case because they are matched against the
    # lower-cased name; a capitalised "Remix" can never match it.
    remix = str_detect(track_name_lower, "remix"),
    feature = str_detect(track_name_lower, "feat"),
    ma_prep = remix | feature,
    # Rows where the match is NA are treated as "One Artist".
    ma_prep2 = replace_na(ma_prep, FALSE),
    multiple_artists = if_else(
      ma_prep2,
      true = "Multiple Artists",
      false = "One Artist"
    ),
    popular = track_popularity > 75
  ) %>%
  group_by(multiple_artists, playlist_genre) %>%
  # Ungroup so the releveling below is applied to the whole column at once.
  summarize(prop_pop = mean(popular) * 100, .groups = "drop") %>%
  mutate(genre = fct_relevel(playlist_genre, "rap")) %>%
  ggplot() +
  geom_col(aes(x = multiple_artists, y = prop_pop)) +
  facet_wrap(~genre) +
  labs(
    title = "Popularity of Songs Containing Mulitple Artists Across Genre",
    x = "",
    y = "Percent of Songs Popular"
  ) +
  theme_classic() +
  theme(
    plot.title.position = "plot",
    plot.title = element_text(size = 20, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "italic")
  )
## `summarise()` regrouping output by 'multiple_artists' (override with `.groups` argument)

R&B

# Preview the first six rows of the R&B subset.
randb %>%
  head(n = 6L)
## # A tibble: 6 x 20
##   track_name track_artist track_popularity track_album_name track_album_rel…
##   <chr>      <chr>                   <dbl> <chr>            <chr>           
## 1 Life Is G… Future                     93 Life Is Good (f… 2020-01-10      
## 2 Ayy Macar… Tyga                       91 Ayy Macarena     2019-11-13      
## 3 HIGHEST I… Travis Scott               89 JACKBOYS         2019-12-27      
## 4 FML        Arizona Zer…               82 Living Facts     2018-06-03      
## 5 OUT WEST … JACKBOYS                   87 JACKBOYS         2019-12-27      
## 6 Out Of Yo… French Mont…               75 MONTANA          2019-12-06      
## # … with 15 more variables: playlist_name <chr>, playlist_genre <chr>,
## #   playlist_subgenre <chr>, danceability <dbl>, energy <dbl>, key <dbl>,
## #   loudness <dbl>, mode <dbl>, speechiness <dbl>, acousticness <dbl>,
## #   instrumentalness <dbl>, liveness <dbl>, valence <dbl>, tempo <dbl>,
## #   duration_ms <dbl>
# Animated density of track popularity for the R&B subset, stepping through
# one subgenre at a time; the subtitle shows the current subgenre.
# TODO: get rid of axes, make subtitle descriptive
ggplot(
  data = randb,
  mapping = aes(
    x = track_popularity,
    color = playlist_subgenre,
    fill = playlist_subgenre
  )
) +
  geom_density(alpha = 0.1) +
  transition_states(
    states = playlist_subgenre,
    transition_length = 3,
    state_length = 1
  ) +
  labs(
    subtitle = "R&B Subgenre: {closest_state}",
    title = "ADD TITLE"
  )

# Save the animation as a GIF, then embed that file in the document.
anim_save(filename = "randb_density.gif")
knitr::include_graphics(path = "randb_density.gif")

Why do hip pop and urban contemporary have such similar density curves? For this section, I want to look specifically at the audio features of these two subgenres.

# Mean popularity and audio-feature values for the two R&B subgenres being
# compared, rendered as a table.
randb %>%
  # %in% tests every row against both names. Comparing with `==` against a
  # length-2 vector recycles that vector down the column, so a row is kept
  # only when it happens to line up with its own name and many matching rows
  # are silently dropped.
  filter(playlist_subgenre %in% c("hip pop", "urban contemporary")) %>%
  group_by(playlist_subgenre) %>%
  summarise(
    across(
      all_of(c(
        "track_popularity", "danceability", "energy", "key", "loudness",
        "mode", "speechiness", "instrumentalness", "liveness", "valence",
        "tempo", "duration_ms"
      )),
      ~ mean(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  ) %>%
  knitr::kable()
## Warning in playlist_subgenre == c("hip pop", "urban contemporary"): longer
## object length is not a multiple of shorter object length

## Warning in playlist_subgenre == c("hip pop", "urban contemporary"): longer
## object length is not a multiple of shorter object length
playlist_subgenre track_popularity danceability energy key loudness mode speechiness instrumentalness liveness valence tempo duration_ms
hip pop 82.62411 0.6985887 0.6000979 5.014184 -6.380170 0.6808511 0.1304929 0.0120022 0.1580014 0.4780922 116.8704 200865.0
urban contemporary 81.98039 0.6823333 0.5401578 5.696078 -7.651382 0.4803922 0.1340971 0.0135849 0.1504039 0.4606735 121.0225 207035.3
# maybe somehow graph this??